** this file reads in the bilateral distances
** and cleans them, saves to the restat folder


* I am going to use distw_harmonic
* according to the file "Even though this second formula [for distw_harmonic
* 	is less intuitive, distw_harmonic is a more appropriate measure than distw_arithmetic
* 	since the empirical estimates of the distance elastiicty of trade flows
*	are unambiguously closer to -1 than to 1.

* if i need more variables add back in later
* update 02/03/2023 - adding in other distance measures after discussion with Pol + Teresa
* also added in GDP

* update 02/15/2023 - add in trade agreements

* update 04/20/2023 - add in common language, legal system, contiguous

* update 05/08/2023 - clean UN regions file


 **Set directories
cd ""  /* PROJECT ROOT FOLDER */

/* DEFINE GLOBALS PATHS HERE */

global data "data"
global temp work
global distances "data/distances"
global input_country "input/country_data_input"

use $input_country/Gravity_V202211_abb.dta, clear


keep if year == 2007
keep if country_exists_o == 1 & country_exists_d == 1

keep iso3_o iso3_d distw_harmonic dist distcap distw_arithmetic gdp_o gdp_d fta_wto rta_type contig comlang_off comleg_pretrans

ren distw_harmonic dist_harm
ren distw_arithmetic dist_arith
ren dist dist_simple
ren distcap dist_cap

gen fta = 0
replace fta = 1 if (fta_wto != 0 & fta_wto != .) | (rta_type > 0 & rta_type != .)
drop fta_wto rta_type

* some silly fix
replace fta = 1 if iso3_o == iso3_d


save $distances/distance_mat.dta, replace


** save a separate copy of US as destination to make subsequent code more efficient
keep if iso3_d == "USA"

ren dist_harm dist_harm_USA
ren dist_arith dist_arith_USA
ren dist_simple dist_simple_USA
ren dist_cap dist_cap_USA

save $distances/distance_mat_USA.dta, replace



* clean UN data
import excel using "$input_country/UN_country_regions.xlsx", clear firstrow

keep ISOalpha3Code SubregionCode SubregionName

ren ISOalpha3Code iso3
ren SubregionCode region

* collapsing micronesia, melanesia, polynesia with aus + new zealand
replace region = 53 if region == 54 | region == 57 | region == 61

* combining northern and western europe
replace region = 155 if region == 154

* drop SubregionName


save "$data/un_country_regions_abb.dta", replace



use $data/country_data_unique, clear
ren region region_old
merge 1:1 iso3 using "$data/un_country_regions_abb.dta"

* dropping countries not in the old dataset
drop if _merge ==2

* fixing merge == 1, mainly just TWN
replace region = 30 if iso3 == "TWN"
replace SubregionName = "Eastern Asia" if iso3 == "TWN"

replace region = 39 if iso3 == "KSV"
replace SubregionName = "Southern Europe" if iso3 == "KSV"

drop region_det region_old regioncode
drop _merge

save $data/country_data_unique_un, replace

